#coding=utf-8

# The urllib module provides the interface for reading web page data
import urllib.request
import os
# The re module provides regular expression support
import re
from selenium import webdriver
# from selenium import webdriver
# path = "D:\chromedriver\chromedriver.exe"
# browser = webdriver.Chrome(executable_path=path)
# browser.get('http://www.baidu.com')
# ------ Fetch the raw source of a web page ------
def getHtml(url):
    """Download ``url`` and return the raw response body.

    The request carries a desktop Chrome ``User-Agent`` header instead of
    urllib's default one.

    Args:
        url: Address of the page to fetch.

    Returns:
        The undecoded response body as ``bytes``; the caller is responsible
        for decoding it.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` propagates
        unchanged to the caller.
    """
    request = urllib.request.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                                     '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')
    # Open the Request object, not the bare URL string: passing the URL would
    # build a fresh request and silently drop the User-Agent header set above.
    # The context manager closes the response once the body has been read.
    with urllib.request.urlopen(request) as response:
        return response.read()

# ------ Pass the URL of any post to getHtml(), then decode the ------
# ------ downloaded bytes as UTF-8 text and dump the page source ------
html = getHtml("https://neihanshequ.com/").decode('UTF-8')
print(html)

# ------ Collect every text snippet found in the post page ------
def getText(html):
    """Return the text of each matching ``<p>`` element in ``html``.

    A match starts at a ``<div class="detail-wrapper">`` tag, skips ahead to
    the next ``<h1`` and then to the next ``p>``, and captures everything up
    to the following ``</p>``. Dot matches newlines, so a captured snippet
    may span several lines.

    Args:
        html: Page source as a ``str``.

    Returns:
        A list with one captured string per match, in document order; empty
        when nothing matches.
    """
    matcher = re.compile(
        r'<div class="detail-wrapper">.*?<h1.*?p>(.*?)</p>',
        flags=re.DOTALL,
    )
    return matcher.findall(html)
# Run the extraction over the decoded page source and print the resulting
# list of captured snippets.
pag = getText(html)
print(pag)

#print(imgList)
# imgName = 0
# os.mkdir("D:/tieba/'+ingName+'")
# for imgPath in imgList:
#      # ------ 这里最好使用异常处理及多线程编程方式 ------
#
#         f = open("D:/tieba/07/"+str(imgName)+".jpg", 'wb')
#         f.write((urllib.request.urlopen(imgPath)).read())
#         f.close()
#         imgName += 1
# print("All Done!")